Pelagophyceae
Initialize
library(pr2database)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(tibble)
library(readr)
library(maps)
library(cowplot)
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
library(patchwork)
# data("pr2")## Warning: package 'knitr' was built under R version 3.5.3
PR2
Use the PR2 on Google
# Info for the local dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
filter (is.na(removed_version)) %>%
collect()
pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version)) %>%
collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>%
collect()
# Join the tables and keep only sequences that are not removed
pr2 <- pr2_main %>%
left_join(pr2_taxo, by = c("species"="species")) %>%
left_join (pr2_seq) %>%
left_join (pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>%
filter (is.na(removed_version))Export Pelagophyceae
pr2_export <- pr2 %>%
filter(class == "Pelagophyceae") %>%
arrange(class, order, family, genus, species)
pr2_export(pr2_export, "../pr2/pr2_4_12_Pelagophyceae.fasta.gz", file_type = "fasta", file_format = "fasta_taxo_long")
pr2_export(pr2_export, "../pr2/pr2_4_12_Pelagophyceae.xlsx", file_type = "merged_excel")
pr2_export_taxonomy <- pr2_export %>%
count(class, order, family, genus, species)
knitr::kable(pr2_export_taxonomy)
openxlsx::write.xlsx(pr2_export_taxonomy, file="../pr2/pr2_Pelagoophyceae_taxonomy.xlsx", na="")RCC
Read RCC
db_rcc <- db_connect(db_info("rcc_local") )
cultures <- tbl(db_rcc, "cultures") %>%
collect()
taxonomy <- tbl(db_rcc, "taxonomy") %>%
collect()
cultures <- left_join(cultures, taxonomy)
# cultures <- cultures %>% filter(lost==0)
cultures <- cultures %>% mutate(year_entered = lubridate::year(date_entered_catalog),
year_sampled = lubridate::year(lubridate::parse_date_time(sampling_date, "ymd")),
year_lost = lubridate::year(lost_date),
latitude = mapply(lat_long_dec,sampling_lat_deg,sampling_lat_mn,sampling_lat_ns,"latitude"),
longitude = mapply(lat_long_dec,sampling_long_deg,sampling_long_mn,sampling_long_ew,"longitude"))
sequences <- tbl(db_rcc, "sequences") %>% collect()
db_disconnect(db_rcc)
# Build some tables -------------------------------------------------------
sequences_18S <- sequences %>%
filter(str_detect(gene_name, "18S")) %>%
select(rcc_id, genbank_accession, sequence) %>%
rename(accession_18S = genbank_accession, sequence_18S = sequence) %>%
mutate(sequence_18S_length = str_length(sequence_18S))
sequences_18S_longer <- sequences_18S %>%
arrange(rcc_id, desc(sequence_18S_length)) %>%
distinct(rcc_id, .keep_all = TRUE)
sequences_ITS <- sequences %>%
filter(str_detect(gene_name, "ITS")) %>%
select(rcc_id, genbank_accession, sequence) %>%
rename(accession_ITS = genbank_accession, sequence_ITS = sequence)Construct and export tables
cultures_Pelago <- cultures %>%
# filter(lost==0) %>%
filter(class == "Pelagophyceae") %>%
arrange(order, family, genus, species, rcc_id) %>%
select(order, family, genus, species, rcc_id, lost_date,
strain_name, strain_name_synonyms,
sampling_ocean, sampling_regional_sea , sampling_depth,
sampling_date, latitude, longitude) %>%
left_join(sequences_18S_longer) %>%
left_join(sequences_ITS)
openxlsx::write.xlsx(cultures_Pelago, "../rcc/RCC_Pelago.xlsx", na="")Export fasta
cultures_Pelago_18S <- cultures_Pelago %>%
filter(!is.na(accession_18S)) %>%
mutate(sequence_name = str_c(accession_18S, str_c("RCC", rcc_id), strain_name,species, sep="|" ))
seq_out <- DNAStringSet(cultures_Pelago_18S$sequence_18S) # Store the sequence in a DNAString
names(seq_out) <- cultures_Pelago_18S$sequence_name
writeXStringSet(seq_out, "C:/Users/vaulot/Desktop/RCC_Pelago_18S.fasta", compress=FALSE) Metabarcodes
Datasets included
All public data sets
datasets <- metapr2_export_datasets() %>%
filter(public == 1) %>%
filter(processing_pipeline_metapr2 %in% c("dada2", "swarm"))
datasets %>%
select(dataset_id, dataset_code, dataset_name, gene_region) %>%
arrange(gene_region) %>%
knitr::kable() | dataset_id | dataset_code | dataset_name | gene_region |
|---|---|---|---|
| 1 | OSD_2014_V4_LGC | Ocean Sampling Day - 2014 - V4 LGC | V4 |
| 2 | OSD_2015_V4 | Ocean Sampling Day - 2015 - V4 | V4 |
| 3 | OSD_2014_V4_LW | Ocean Sampling Day - 2014 - V4 LW | V4 |
| 5 | MALINA_Monier_2014 | Arctic Ocean, Beaufort Sea, MALINA cruise - 2009 | V4 |
| 6 | Arctic_Stecher_2016 | Central Arctic Ocean - 2012 | V4 |
| 9 | Nansen_Metfies_2016 | Nansen Basin - 2012 | V4 |
| 11 | Fieldes_Bay_Luo_2016 | Fieldes Bay, Antarctic - 2013 | V4 |
| 19 | Baltic_Sea_2012_2013 | Gulf of Finland - 2012-2013 | V4 |
| 20 | Oslo_fjord_2009_2011 | Oslo fjord - 2009-2011 | V4 |
| 33 | Tsunami_18S | Tsunami deposits 18S R1 | V4 |
| 34 | Malaspina_vertical_V4 | Malaspina expedition - vertical profiles - 2010-2011 | V4 |
| 35 | Malaspina_surface_V4 | Malaspina expedition - surface - 2010-2011 | V4 |
| 36 | Blanes_2004_2013 | Blanes Time Series - 2004-2013 | V4 |
| 37 | Baffin_Bay_Joli_2013 | Baffin Bay - 2013 | V4 |
| 38 | White_Sea_2013_2015 | White Sea - 2013-2015 | V4 |
| 39 | Arctic_Ocean_PS80_2012 | Arctic Ocean - Polarstern expedition ARK-XXVII/3 - 2012 | V4 |
| 40 | Arctic_Ocean_Survey | Arctic Ocean Survey - 2005-2011 | V4 |
| 41 | Chukchi_Sea_ICESCAPE | Chukchi Sea - ICESCAPE - 2010 | V4 |
| 42 | Arctic_Nares_Strait | Nares Strait - 2014 | V4 |
| 43 | Baltic_Gdansk_2012 | Gdansk Gulf - 2012 | V4 |
| 44 | Baltic_Gdansk_2012_Hapto | Gdansk Gulf - 2012 Hapto | V4 |
| 49 | Naples_2011 | Bay of Naples - 2011 | V4 |
| 53 | Biomarks | Biomarks | V4 |
| 15 | Tara_Oceans | Tara Oceans - 2009-2012 | V9 |
datasets_selected <- list()
datasets_selected[["all"]] <- datasets %>%
filter(dataset_id %in% c(1, 2, 15, 34, 35, 36, 53))
cat("Data sets selected")Data sets selected
datasets_selected[["all"]] %>%
select(dataset_id, dataset_code, dataset_name, gene_region) %>%
arrange(gene_region) %>%
knitr::kable()| dataset_id | dataset_code | dataset_name | gene_region |
|---|---|---|---|
| 1 | OSD_2014_V4_LGC | Ocean Sampling Day - 2014 - V4 LGC | V4 |
| 2 | OSD_2015_V4 | Ocean Sampling Day - 2015 - V4 | V4 |
| 34 | Malaspina_vertical_V4 | Malaspina expedition - vertical profiles - 2010-2011 | V4 |
| 35 | Malaspina_surface_V4 | Malaspina expedition - surface - 2010-2011 | V4 |
| 36 | Blanes_2004_2013 | Blanes Time Series - 2004-2013 | V4 |
| 53 | Biomarks | Biomarks | V4 |
| 15 | Tara_Oceans | Tara Oceans - 2009-2012 | V9 |
Summarize functions
Update and complement data
- Add combination of samples and species that have 0 reads
- Add class specific information
update_asv_set <- function(one_asv_set) {
# one_asv_set <- asv_set[["V4"]]
one_asv_set$samples <- one_asv_set$samples %>%
filter(!is.na(file_code) ) %>%
filter(file_code !="") %>%
filter(!is.na(DNA_RNA)) # Some Tara files do not correspond to any samples...
# Find samples which have no pelagophytes
samples_no_pelago <- one_asv_set$samples %>%
filter(!(file_code %in% one_asv_set$df$file_code))
# Extract just file_code, species and n_reads
samples_species_counts <- one_asv_set$df %>%
select(file_code, species, n_reads)
taxo <- one_asv_set$df %>%
select(species, order, family, genus, species) %>%
distinct()
# Create a full table with all combinations of samples and species
samples_species <- bind_rows(one_asv_set$df, samples_no_pelago) %>%
expand(file_code, species) %>%
filter(!is.na(species)) %>%
left_join(one_asv_set$samples) %>%
left_join(taxo) %>%
left_join(samples_species_counts) %>%
mutate(n_reads = replace_na(n_reads, 0))
one_asv_set$df <- samples_species
# Normalize by total number of Pelagophyceae reads
asv_class <- one_asv_set$df %>%
group_by(file_code) %>%
summarise(reads_class = sum(n_reads))
one_asv_set$df <- left_join(one_asv_set$df, asv_class) %>%
mutate (pct_class = n_reads/reads_class*100,
pct_total = n_reads/reads_total*100,
pct_pelago_total = reads_class/reads_total*100)
return(one_asv_set)
}Statistics
summarise_asv_set <- function(one_asv_set, directory) {
# one_asv_set <- filter(asv_set[["V4"]]$df, dataset_id != 36)
# directory = "../metabarcodes/V4/"
summary_species <- phyloseq_long_treemap(one_asv_set$df, genus, species, "Species" )
summary_pct <- one_asv_set$df %>%
group_by(species) %>%
summarise(pct_class = mean(pct_class, na.rm = TRUE),
pct_total = mean(pct_total, na.rm = TRUE))
summary_species <- left_join(summary_species$df, summary_pct)
openxlsx::write.xlsx(summary_species, str_c(directory, "summary_species.xlsx"))
summary_substrate <- one_asv_set$samples %>%
count(substrate)
summary_depth <- one_asv_set$samples %>%
count(depth_level)
summary_fraction <- one_asv_set$samples %>%
mutate(fraction_label = str_c(format(fraction_min, nsmall=2, scientific = FALSE),
format(fraction_max, nsmall=2, scientific = FALSE), sep="_") ) %>%
count(fraction_label)
return(list(summary_species, summary_substrate, summary_depth, summary_fraction))
}Maps
map_asv_set <- function(one_asv_set, directory) {
# one_asv_set <- filter(asv_set[["V4"]]$df, dataset_id != 36)
# directory = "../metabarcodes/V4/"
world <- map_data("world")
# g <- ggplot() +
# geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="grey") +
# coord_fixed(1.3) +
# geom_point(data=one_asv_set, aes(x=longitude, y=latitude, size=pct_class), fill="blue", shape=21) +
# # scale_size_area(trans="log10") +
# ggtitle("Percent of Pelagophyceae") +
# facet_wrap(vars(species))
# print(g)
df <- one_asv_set %>%
select(longitude, latitude, pct_pelago_total) %>%
rename(z=pct_pelago_total) %>%
distinct()
gg <- map_distribution(df, map_title = "Pelagophyceae - % of Total reads",
z_limits = c(0, 10),
z_breaks = 1*c(1, 2, 4, 6, 8, 10))
print(gg)
ggsave(plot= gg , filename=str_c(directory,"map_Pelago_pct_total.pdf"),
width = 15 , height = 12, scale=1.80,
units="cm", useDingbats=FALSE)
for (one_species in unique(one_asv_set$species)) {
df <- one_asv_set %>%
filter(species == one_species) %>%
select(longitude, latitude, pct_class) %>%
rename(z=pct_class)
gg <- map_distribution(df, map_title = str_c(one_species," - % of Pelagophyceae"))
print(gg)
ggsave(plot= gg , filename=str_c(directory,"maps_pct_pelago/map_",one_species,"_pct_pelago.pdf"),
width = 15 , height = 12, scale=1.80,
units="cm", useDingbats=FALSE)
}
# for (one_species in unique(one_asv_set$species)) {
# df <- one_asv_set %>%
# filter(species == one_species) %>%
# select(longitude, latitude, pct_total) %>%
# rename(z=pct_total)
# gg <- map_distribution(df, map_title = str_c(one_species," - % of Total reads"),
# z_limits = c(0, 1),
# z_breaks = 0.1*c(1, 2, 4, 6, 8, 10))
# print(gg)
# ggsave(plot= gg , filename=str_c(directory,"maps_pct_total/map_",one_species,"_pct_total.pdf"),
# width = 15 , height = 12, scale=1.80,
# units="cm", useDingbats=FALSE)
# }
return(TRUE)
}Time series
ts_asv_set <- function(one_asv_set, directory) {
# one_asv_set <- filter(asv_set[["V4"]]$df, dataset_id == 36)
df <- one_asv_set %>%
mutate(replicate = replace_na(replicate, 1)) %>%
filter(replicate == 1) %>%
select(date, species, date, fraction_name, pct_class) %>%
mutate(date = lubridate::as_date(date))
gg <- ggplot(df, aes(x = date, y = pct_class, fill = species)) +
geom_col(width=30) +
theme_bw() +
scale_fill_viridis_d() +
xlab("") + ylab("Proportion") +
facet_grid(rows = vars(fraction_name) ) +
scale_x_date()
print(gg)
ggsave(plot= gg , filename=str_c(directory,"Blanes_time_series.pdf"),
width = 15 , height = 12, scale=1.80,
units="cm", useDingbats=FALSE)
return(TRUE)
}Get the data
# Export data
asv_set <- list()
for (one_region in regions) {
asv_set[[one_region]] <- metapr2_export_asv(taxo_level = class,
taxo_name="Pelagophyceae",
dataset_id_selected = datasets_selected[[one_region]]$dataset_id,
export_fasta = TRUE,
export_wide_xls = TRUE,
export_sample_xls = TRUE,
export_phyloseq = FALSE,
directory = str_c("../metabarcodes/",one_region, "/"),
taxonomy_full= TRUE,
boot_min = 100,
boot_level = class_boot,
use_hash = TRUE)
saveRDS(asv_set[[one_region]], str_c("../metabarcodes/", one_region, "/asv_set.rda"))
} Summarise the data
- Maps are only for surface samples
V4 - OSD 2014-2015, Malaspina, Blanes, Biomarks
asv_set <- list()
one_region <- "V4"
asv_set[[one_region]] <- readRDS( str_c("../metabarcodes/", one_region, "/asv_set.rda"))
asv_set[[one_region]] <- update_asv_set(asv_set[[one_region]])
summaries <- summarise_asv_set(asv_set[[one_region]],
directory = str_c("../metabarcodes/", one_region, "/"))=============
=============
=============
=============
[[1]]
genus species n_reads pct_class pct_total
------------------------- ----------------------------- -------- ----------- ----------
Ankylochrysis Ankylochrysis_lutea 1124 0.3534281 0.0016651
Ankylochrysis Ankylochrysis_sp. 115 0.4789488 0.0002084
Aureococcus Aureococcus_anophagefferens 32697 7.9762765 0.0310217
Aureoumbra Aureoumbra_lagunensis 50 0.1202055 0.0001139
Chrysocystis Chrysocystis_fragilis 2799 1.4307706 0.0020358
Chrysoreinhardia Chrysoreinhardia_sp. 2 0.0000070 0.0000020
Pelagococcus Pelagococcus_sp. 6110 3.4071300 0.0075800
Pelagococcus Pelagococcus_subviridis 1 0.0003316 0.0000011
Pelagomonadaceae_clade_A Pelagomonadaceae_clade_A_sp. 23710 9.2576480 0.0249426
Pelagomonadaceae_X Pelagomonadaceae_X_sp. 351 0.1036844 0.0005932
Pelagomonas Pelagomonas_calceolata 720265 40.5003490 0.6885337
Pelagophyceae_Svalbard Pelagophyceae_Svalbard_sp. 498 0.2980440 0.0011230
Pelagophyceae_XXX Pelagophyceae_XXX_sp. 50968 11.2727535 0.0481332
Pulvinaria Pulvinaria_sp. 4 0.0001133 0.0000009
Sarcinochrysidaceae_X Sarcinochrysidaceae_X_sp. 800 1.5698927 0.0015097
Sarcinochrysis Sarcinochrysis_sp. 12502 7.6793874 0.0171452
[[2]]
substrate n
---------- -----
sediment 24
water 1000
[[3]]
depth_level n
------------- ----
anoxic 7
bathypelagic 63
bottom 24
DCM 45
euphotic 37
mesopelagic 60
surface 788
[[4]]
fraction_label n
--------------- ----
NA_ NA 24
0.00_ 0.20 9
0.20_ 3.00 120
0.22_ NA 293
0.22_ 3.00 303
0.80_ 3.00 38
3.00_ 20.00 205
20.00_2000.00 32
map_asv_set(filter(asv_set[[one_region]]$df,
dataset_id != 36,
depth_level=="surface"),
directory = str_c("../metabarcodes/", one_region, "/"))[1] TRUE
ts_asv_set(filter(asv_set[[one_region]]$df,
dataset_id == 36),
directory = str_c("../metabarcodes/", one_region, "/"))[1] TRUE
V9 - Tara Oceans
asv_set <- list()
one_region <- "V9"
asv_set[[one_region]] <- readRDS( str_c("../metabarcodes/", one_region, "/asv_set.rda"))
asv_set[[one_region]] <- update_asv_set(asv_set[[one_region]])
summarise_asv_set(asv_set[[one_region]],
directory = str_c("../metabarcodes/", one_region, "/"))[[1]]
# A tibble: 6 x 5
# Groups: genus [6]
genus species n_reads pct_class pct_total
<chr> <chr> <dbl> <dbl> <dbl>
1 Aureococcus Aureococcus_anophageffere~ 887561 16.6 0.0800
2 Pelagococcus Pelagococcus_sp. 25784 3.67 0.00374
3 Pelagomonadaceae_clade~ Pelagomonadaceae_clade_A_~ 16250 4.24 0.00157
4 Pelagomonadaceae_X Pelagomonadaceae_X_sp. 18632 2.77 0.00158
5 Pelagomonas Pelagomonas_calceolata 3210461 70.9 0.311
6 Sarcinochrysidaceae_X Sarcinochrysidaceae_X_sp. 473 0.222 0.0000379
[[2]]
# A tibble: 1 x 2
substrate n
<chr> <int>
1 water 884
[[3]]
# A tibble: 3 x 2
depth_level n
<chr> <int>
1 euphotic 306
2 mesopelagic 70
3 surface 508
[[4]]
# A tibble: 12 x 2
fraction_label n
<chr> <int>
1 " 0.22_ 3.00" 1
2 " 0.80_ NA" 112
3 " 0.80_ 3.00" 29
4 " 0.80_ 5.00" 175
5 " 0.80_ 20.00" 8
6 " 0.80_ 200.00" 2
7 " 3.00_ NA" 35
8 " 3.00_ 20.00" 1
9 " 5.00_ 20.00" 165
10 " 20.00_ 180.00" 168
11 " 20.00_ 200.00" 8
12 "180.00_2000.00" 180
map_asv_set(filter(asv_set[[one_region]]$df,
dataset_id != 36,
depth_level=="surface"),
directory = str_c("../metabarcodes/", one_region, "/"))[1] TRUE
Preliminary analyses - DO NOT RUN
Check that all sequence with same hash have same taxonomy
This check was done before the hash analysis was done
asv_sequence_V4_hash_repeated <- asv_set_V4$fasta %>%
group_by(sequence_hash, species) %>%
summarise(n = n()) %>%
filter(n>1) %>%
arrange(sequence_hash)
asv_sequence_V4_all <- asv_set_V4$fasta %>%
arrange(sequence_hash)
asv_sequence_V4_dereplicated <- asv_set_V4$fasta %>%
group_by(sequence_hash) %>%
slice(1) %>%
ungroup()